Data Mining Final Project

Author

RAVLEEN KAUR CHADHA, MRIDULA KALAISELVAN, YASH SHARMA

# Package setup ----
# Install pacman only when it is missing, then attach it. requireNamespace()
# checks availability without the silent-FALSE behaviour of require().
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
library(pacman)

# Install dsbox from GitHub only when it is not already available, instead of
# reinstalling it on every run. devtools is loaded first (p_load installs it
# if needed) because install_github() lives in that package.
if (!requireNamespace("dsbox", quietly = TRUE)) {
  pacman::p_load(devtools)
  devtools::install_github("tidyverse/dsbox")
}

# p_load() installs any package that is missing and then attaches it.
pacman::p_load(
  tidyverse,
  scales,
  devtools,
  here,
  class,
  plotly,
  dplyr,
  caret,
  BiocManager,
  smotefamily,
  pROC,
  mclust,
  factoextra,
  cluster
)
# Load the raw Hepatitis C measurements
hepatitis <- read_csv("HepatitisCdata.csv")

# Median imputation: in every numeric column, each missing entry is replaced
# by the median of that column's observed values
hepatitis <- hepatitis %>%
  mutate(
    across(
      where(is.numeric),
      function(col) replace(col, is.na(col), median(col, na.rm = TRUE))
    )
  )

# Per-column count of remaining NA values (all zeros means nothing is missing)
colSums(is.na(hepatitis))
    ...1 Category      Age      Sex      ALB      ALP      ALT      AST 
       0        0        0        0        0        0        0        0 
     BIL      CHE     CHOL     CREA      GGT     PROT 
       0        0        0        0        0        0 
# Persist the imputed table; later sections re-read this file
write.csv(
  x = hepatitis,
  file = "HepatitisC_Cleaned.csv",
  row.names = FALSE
)
# EDA for the dataset

# Colorblind-friendly palette (viridis hex codes), one color per category
color_palette <- c("#440154FF", "#3B528BFF", "#21908CFF", "#5DC863FF", "#FDE725FF")

# Interactive scatterplot of AST against ALT, colored by diagnosis category.
#
# Category holds descriptive text (the classification step compares it with
# "0=Blood Donor"), so it is shown as-is in the hover text and legend.
# Comparing it with the numbers 0-4 never matches, and the legend has no
# `labels` attribute, so neither is used here. All legend settings live in a
# single layout() call.
p <- plot_ly(
  data = hepatitis,
  x = ~AST,
  y = ~ALT,
  type = "scatter",
  mode = "markers",
  color = ~as.factor(Category),
  colors = color_palette,
  marker = list(
    size = 8,
    opacity = 0.8
  ),
  # Hover text: one line each for category, AST, ALT and age
  text = ~paste(
    "Category:", as.character(Category),
    "<br>AST:", AST,
    "<br>ALT:", ALT,
    "<br>Age:", Age
  )
) %>%
  layout(
    title = list(
      text = "Interactive Scatterplot: AST vs ALT by Category",
      font = list(size = 18)
    ),
    # Axis titles use title = list(text, font); `titlefont` is deprecated
    xaxis = list(
      title = list(
        text = "AST (Aspartate Transaminase)",
        font = list(size = 14)
      ),
      zeroline = FALSE,
      showgrid = FALSE
    ),
    yaxis = list(
      title = list(
        text = "ALT (Alanine Transaminase)",
        font = list(size = 14)
      ),
      zeroline = FALSE,
      showgrid = FALSE
    ),
    # Vertical legend placed to the right of the plotting area
    legend = list(
      title = list(text = "Category"),
      font = list(size = 12),
      orientation = "v",
      x = 1.1,
      y = 0.5,
      xanchor = "left",
      traceorder = "normal",
      itemsizing = "constant"
    ),
    plot_bgcolor = "#FFFFFF",
    paper_bgcolor = "#FFFFFF"
  )

# Displaying the plot
p

Classification Analysis for the dataset Hepatitis-C

# Install and load required libraries
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
if (!requireNamespace("themis", quietly = TRUE)) {
  install.packages("themis")
}
if (!requireNamespace("recipes", quietly = TRUE)) {
  install.packages("recipes")
}
if (!requireNamespace("pROC", quietly = TRUE)) {
  install.packages("pROC")
}

library(caret)
library(themis)
library(recipes)
library(pROC)

# Re-read the cleaned data that was written to disk earlier
hepatitis <- read.csv("HepatitisC_Cleaned.csv")

# Build the modelling table in a single pipeline:
#   * BinaryCategory - factor, 1 when Category is "0=Blood Donor", else 0
#   * Sex            - 1 when Sex is "m", else 0
#   * the `...1` index column and the original Category are dropped
hepatitis <- hepatitis %>%
  mutate(
    BinaryCategory = as.factor(ifelse(Category == "0=Blood Donor", 1, 0)),
    Sex = ifelse(Sex == "m", 1, 0)
  ) %>%
  select(-...1, -Category)

# 80/20 train/test split on the target; the seed makes it repeatable
set.seed(123)
train_index <- createDataPartition(
  y = hepatitis$BinaryCategory,
  p = 0.8,
  list = FALSE
)
train_data <- hepatitis[train_index, ]
test_data <- hepatitis[-train_index, ]

# Recipe that applies SMOTE to the training rows only; over_ratio = 1 asks
# for a 1:1 class ratio
smote_recipe <- step_smote(
  recipe(BinaryCategory ~ ., data = train_data),
  BinaryCategory,
  over_ratio = 1
)

# Estimate the recipe and extract the processed training set
smote_data <- bake(prep(smote_recipe), new_data = NULL)

# Announce the class table that is printed next
print("Class distribution after applying SMOTE:")
[1] "Class distribution after applying SMOTE:"
# Count the rows in each outcome class of the SMOTE-processed training data
print(table(smote_data$BinaryCategory))

  0   1 
427 427 
# Fit a binomial GLM (logistic regression) of BinaryCategory on every other
# column of the SMOTE-processed training data
smote_logistic_model <- glm(
  formula = BinaryCategory ~ .,
  family = "binomial",
  data = smote_data
)

# Header for the model summary printed next
print("Logistic Regression Model Summary:")
[1] "Logistic Regression Model Summary:"
# Show the coefficient table and fit statistics of the fitted model
print(summary(smote_logistic_model))

Call:
glm(formula = BinaryCategory ~ ., family = "binomial", data = smote_data)

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept)  6.246494   3.005821   2.078 0.037697 *  
Age          0.055334   0.019529   2.833 0.004605 ** 
Sex          1.277596   0.462124   2.765 0.005699 ** 
ALB          0.170787   0.051974   3.286 0.001016 ** 
ALP          0.091808   0.011753   7.812 5.65e-15 ***
ALT          0.033121   0.007777   4.259 2.06e-05 ***
AST         -0.163989   0.021844  -7.507 6.04e-14 ***
BIL         -0.084453   0.025373  -3.328 0.000873 ***
CHE         -0.239305   0.104772  -2.284 0.022368 *  
CHOL         0.646587   0.210365   3.074 0.002115 ** 
CREA        -0.024328   0.004495  -5.412 6.22e-08 ***
GGT         -0.038332   0.006404  -5.986 2.15e-09 ***
PROT        -0.182912   0.040763  -4.487 7.22e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1183.90  on 853  degrees of freedom
Residual deviance:  251.74  on 841  degrees of freedom
AIC: 277.74

Number of Fisher Scoring iterations: 9
# Predicted probabilities for the held-out test rows. With a two-level factor
# response, a binomial glm models the second level, here "1" (Blood Donor).
smote_predictions <- predict(smote_logistic_model, newdata = test_data, type = "response")

# Convert probabilities to class labels (threshold = 0.5)
smote_predicted_classes <- ifelse(smote_predictions > 0.5, 1, 0)

# Evaluate the model.
# The predictions are given the same level set as the reference so the two
# factors always line up, even if the model predicts only one class on some
# split. The reference is already a factor and is passed through unchanged.
smote_confusion_matrix <- confusionMatrix(
  data = factor(
    smote_predicted_classes,
    levels = levels(test_data$BinaryCategory)
  ),
  reference = test_data$BinaryCategory
)
print("Confusion Matrix after applying SMOTE:")
[1] "Confusion Matrix after applying SMOTE:"
# Print the confusion table together with the statistics caret derives from it
print(smote_confusion_matrix)
Confusion Matrix and Statistics

          Reference
Prediction  0  1
         0 14  7
         1  2 99
                                          
               Accuracy : 0.9262          
                 95% CI : (0.8646, 0.9657)
    No Information Rate : 0.8689          
    P-Value [Acc > NIR] : 0.03364         
                                          
                  Kappa : 0.7142          
                                          
 Mcnemar's Test P-Value : 0.18242         
                                          
            Sensitivity : 0.8750          
            Specificity : 0.9340          
         Pos Pred Value : 0.6667          
         Neg Pred Value : 0.9802          
             Prevalence : 0.1311          
         Detection Rate : 0.1148          
   Detection Prevalence : 0.1721          
      Balanced Accuracy : 0.9045          
                                          
       'Positive' Class : 0               
                                          
# Plot ROC curve for the SMOTE model.
# levels and direction are stated explicitly instead of being auto-detected:
# "0" is the control class, "1" (Blood Donor) is the case class, and
# direction = "<" says cases are expected to have the higher predicted
# probability. Auto-detection would silently flip the curve for a model that
# ranks the classes the wrong way round.
smote_roc_curve <- roc(
  response = test_data$BinaryCategory,
  predictor = smote_predictions,
  levels = c("0", "1"),
  direction = "<"
)
plot(smote_roc_curve, col = "red", main = "ROC Curve for Logistic Regression (SMOTE)")

Interpretation of the classification - Logistic Regression

Understanding important terms before proceeding…

What is SMOTE? Tried to keep it as easy as possible…

Imagine you’re teaching a class where there are 90 boys and only 10 girls, and you’re trying to predict something about them (e.g., who will pass an exam). Since there are so many more boys than girls, most of your predictions might focus on boys, and you’d struggle to give fair attention to the girls. This is called class imbalance.

SMOTE is like a clever trick to “even the playing field.” It helps balance the boys and girls (or majority and minority groups in your data) by creating new, synthetic data points for the smaller group (girls, in this case) instead of just copying the existing ones.

How Does SMOTE Work? (Super Simple Example) Original Situation:

You have 10 girls (minority group). Instead of just copying the same girls, SMOTE creates “new girls” based on the ones you already have. These aren’t exact copies but are slightly adjusted versions that look realistic.

How It Creates New Data:

Imagine you know two girls, Alice and Bella, who both scored well on the last test. SMOTE “imagines” a new girl, “Cathy,” whose score is somewhere between Alice’s and Bella’s scores. Cathy is not real but synthetic and helps give the smaller group more representation. Result After SMOTE:

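Now, instead of 90 boys and 10 girls, you might have 90 boys and 50 girls (original 10 plus 40 synthetic girls). This balanced data gives your model a fair chance to learn about both groups. In this analysis SMOTE is run with over_ratio = 1, so the smaller class is grown until both classes are the same size (427 vs. 427 rows in the training data).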

Sensitivity: Think of an airport security scanner. Sensitivity is how well you catch actual threats (e.g., detecting weapons). High sensitivity means you miss very few weapons.

Specificity: How well you avoid stopping harmless people (e.g., avoiding false alarms for innocent passengers). High specificity means you don’t wrongly suspect harmless people.

What Does This Visualization Show? This is a ROC Curve (Receiver Operating Characteristic Curve). It’s a graph that helps us understand how well our model is doing at separating two groups:

Blood Donors (Positive Class: 1) Non-Donors (Negative Class: 0) The red curve represents the performance of the model. It shows how good the model is at balancing two things:

Sensitivity (True Positives): How many “Blood Donors” the model correctly identifies. Specificity (True Negatives): How many “Non-Donors” the model correctly identifies. The diagonal gray line is a “coin toss” or random guess. If the red curve were close to this line, the model would be as good as random guessing.

What Question Does This Visualization Answer? “How well can this model distinguish between Blood Donors and Non-Donors?” It shows how accurately the model predicts who is a donor and who is not, across all possible thresholds (cut-off points for deciding whether someone is classified as a donor).

How to Interpret This Curve in Simple Terms The red curve is very close to the top-left corner.

Top-left corner means perfect performance: it catches all donors without falsely labeling “Non-Donors” as donors. Our model does an excellent job of separating the two groups. The steep rise near the beginning tells us:

The model identifies most “Blood Donors” (Sensitivity is high) with very few mistakes early on. High specificity and balanced accuracy:

The flat section at the top shows the model continues to correctly classify “Non-Donors” even as it adjusts thresholds. Overall, this curve shows the model is doing much better than guessing (red curve far above the diagonal line).

Why Is This Important?

This visualization helps answer:

“How reliable is this model?” The red curve being far from the gray line means the model is reliable.

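“Does the model handle Blood Donors well?” Yes, because the curve rises steeply, showing it catches “Blood Donors” early and accurately. Note that Blood Donors are the majority class in this dataset; the minority class, which SMOTE oversampled, is the Non-Donors (about 13% of the test set).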

“Should we trust the model’s predictions?”

Yes, because the curve indicates that the model balances errors well between identifying donors and non-donors.

This graph tells us:

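The model can distinguish between donors and non-donors accurately. At the 0.5 threshold it correctly identifies 93.4% of donors (99 of 106) and 87.5% of non-donors (14 of 16). In the confusion matrix above the 'Positive' class is 0 (non-donors), so the reported sensitivity of 87.5% refers to non-donors and the specificity of 93.4% refers to donors. The model is better than random guessing and does a good job even with imbalanced data, thanks to SMOTE.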

library(ggplot2)

# Store each test row's predicted probability next to its true class
test_data[["Predicted_Probabilities"]] <- smote_predictions

# Overlaid density curves of the predicted probability, one per actual class
ggplot(
  data = test_data,
  mapping = aes(x = Predicted_Probabilities, fill = BinaryCategory)
) +
  geom_density(alpha = 0.5) +
  labs(
    title = "Probability Distribution for Logistic Regression",
    x = "Predicted Probability",
    y = "Density",
    fill = "Actual Class"
  ) +
  theme_minimal()

# Coefficient table of the fitted model as a data frame, one row per term
coef_df <- as.data.frame(summary(smote_logistic_model)[["coefficients"]])

# Drop the intercept row and expose the term names as a regular column
coef_df <- coef_df[rownames(coef_df) != "(Intercept)", ]
coef_df[["Feature"]] <- rownames(coef_df)

# Horizontal bars of the estimates, ordered by absolute size. The model was
# fitted on unscaled columns, so each estimate is on its feature's own scale.
ggplot(
  data = coef_df,
  mapping = aes(x = reorder(Feature, abs(Estimate)), y = Estimate)
) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Feature Importance in Logistic Regression",
    x = "Feature",
    y = "Coefficient Value"
  ) +
  theme_minimal()

# Decision boundary in the ALT/AST plane.
#
# A smoother with method = "glm" and a binomial family would regress the y
# aesthetic (AST) on ALT; AST is not a 0/1 outcome, so that cannot produce a
# class boundary. A logistic regression of the class on ALT and AST only is
# fitted here instead, and the line where its predicted probability equals
# 0.5 is drawn:
#   b0 + b_ALT * ALT + b_AST * AST = 0
#   =>  AST = -(b0 / b_AST) - (b_ALT / b_AST) * ALT
boundary_fit <- glm(
  BinaryCategory ~ ALT + AST,
  data = smote_data,
  family = "binomial"
)
boundary_coefs <- coef(boundary_fit)

ggplot(smote_data, aes(x = ALT, y = AST, color = BinaryCategory)) +
  geom_point() +
  geom_abline(
    intercept = -boundary_coefs[["(Intercept)"]] / boundary_coefs[["AST"]],
    slope = -boundary_coefs[["ALT"]] / boundary_coefs[["AST"]]
  ) +
  labs(
    title = "Decision Boundary of Logistic Regression",
    subtitle = "Model refit on ALT and AST only; line marks predicted probability 0.5",
    x = "ALT",
    y = "AST"
  ) +
  theme_minimal()

GMM Clustering

# Step 1: Load the cleaned dataset; scaling and PCA are done right here
hepatitis_data <- read.csv("HepatitisC_Cleaned.csv")

# Keep every numeric column and standardise it.
# NOTE(review): this selection also keeps the `...1` column (it shows up among
# the numeric columns of the cluster summary printed later). It looks like a
# row index rather than a measurement - confirm whether it should be excluded
# before clustering.
numeric_data <- hepatitis_data[vapply(hepatitis_data, is.numeric, logical(1))]
scaled_data <- scale(numeric_data)

# PCA on the standardised data; only the first two components are carried on
pca <- prcomp(scaled_data, center = TRUE, scale. = TRUE)
pca_data <- pca$x[, c("PC1", "PC2")]

# Step 2: Fit Gaussian mixture models on the two components; Mclust() chooses
# the number of components itself
set.seed(123)  # fixed seed for repeatability
gmm_model <- Mclust(pca_data)

# Report how many components the selected model has
cat("Optimal Number of Clusters (GMM):", gmm_model$G, "\n")
Optimal Number of Clusters (GMM): 2 
# Step 3: Clustering quality - silhouette widths of the GMM assignments,
# using distances between points in the two-component PCA space
silhouette_scores <- silhouette(
  gmm_model$classification,
  dist = dist(pca_data)
)

# Average silhouette width over all observations
avg_silhouette <- mean(silhouette_scores[, "sil_width"])
cat("Average Silhouette Score (GMM):", round(avg_silhouette, 2), "\n")
Average Silhouette Score (GMM): 0.62 
# Adjusted Rand Index (ARI): agreement between the GMM assignments and the
# recorded Category labels, computed only when that column is present
if (!("Category" %in% names(hepatitis_data))) {
  cat("No actual labels provided for ARI calculation.\n")
} else {
  # Turn the Category labels into integer codes before comparing partitions
  actual_labels <- as.numeric(factor(hepatitis_data$Category))
  ari <- adjustedRandIndex(actual_labels, gmm_model$classification)
  cat("Adjusted Rand Index (ARI):", round(ari, 2), "\n")
}
Adjusted Rand Index (ARI): 0.63 
# Step 5: Add Cluster Assignments to Dataset
hepatitis_data$Cluster <- as.factor(gmm_model$classification)

# Step 6: Analyze Clusters
# Mean of every numeric column within each cluster. The extra argument is
# passed inside a lambda because forwarding it through across(...) is
# deprecated in current dplyr.
cluster_summary <- hepatitis_data %>%
  group_by(Cluster) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)))

# Display cluster summary
print(cluster_summary)
# A tibble: 2 × 13
  Cluster  ...1   Age   ALB   ALP   ALT   AST   BIL   CHE  CHOL  CREA   GGT
  <fct>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1        281.  46.6  42.4  66.5  26.1  28.0  8.79  8.46  5.46  78.7  29.2
2 2        545.  54.6  34.3  84.0  49.1  95.4 34.7   5.82  4.54 105.  132. 
# ℹ 1 more variable: PROT <dbl>
# GMM clusters in the PC1/PC2 plane: points plus a filled ellipse per cluster.
# The cluster assignment is attached as a column so the aesthetics refer to
# the plotted data, and one manual scale serves both colour and fill.
ggplot(
  data = data.frame(pca_data, Cluster = as.factor(gmm_model$classification)),
  mapping = aes(x = PC1, y = PC2, color = Cluster)
) +
  geom_point(size = 3, alpha = 0.7) +
  stat_ellipse(aes(fill = Cluster), alpha = 0.2, geom = "polygon") +
  scale_color_manual(
    name = "Cluster",
    values = c("#4CAF50", "#FF5722"),
    labels = c("Cluster 1: Healthy/Low Risk", "Cluster 2: At-Risk/Diseased"),
    aesthetics = c("colour", "fill")
  ) +
  labs(
    title = "Gaussian Mixture Model Clustering",
    subtitle = "Principal Components and Identified Groups",
    x = "Principal Component 1 (Explains 33.3% Variance)",
    y = "Principal Component 2 (Explains 20.5% Variance)"
  ) +
  theme_minimal(base_size = 15) +
  theme(
    # Centered title and subtitle
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    # Legend on the right, with smaller text than the base size
    legend.position = "right",
    legend.title = element_text(face = "bold"),
    legend.text = element_text(size = 10),
    # Smaller axis titles so the long labels fit
    axis.title.x = element_text(size = 9),
    axis.title.y = element_text(size = 9)
  )